R/SIF - mkdown & analysis dev/sif-script.R

Defines functions trim.pop

#' purpose of this script is to scratch out where I'm at so far:
#'
#' Show the visualizations; do some models; play with different spatial scales;
#' do the SIF parameter optimization, etc.
#' But without getting too bogged down trying to do everywhere at once.

# setup ws ----------------------------------------------------------------

library(tidyverse)
library(sf)
library(statnet)
library(igraph)
library(tidygraph)
drop.dir <- Sys.getenv("drop_dir")
ergm.dir <-
  paste0(drop.dir,
         "graphs-and-ergms/")

cur.dir <- "R/SIF & mapping - mkdown & analysis dev"

devtools::load_all()

# get hwy data for visuals & transform to node crs
hwy.dir <-
  paste0(drop.dir, "shapefiles/nhpn/")
  #"/scratch/gpfs/km31/other-local-data/National_Highway_Planning_Network-shp/"
nhpn <-
  st_read(paste0(hwy.dir,
                 "National_Highway_Planning_Network.shp"
  ))
nhpn <- nhpn %>% divM::conic.transform()

# select city -------------------------------------------------------------

city.str <-
  "19700" #philly
#"24701" # st louis
(tmpr <- divM::get.region.identifiers(cz = city.str))


# load prepped graphs -----------------------------------------------------

# at cz & place levels
czdir <- paste0(ergm.dir, "cz-graphs/")
plcdir <- paste0(ergm.dir, "plc-graphs/")

list.files(czdir)
list.files(plcdir)
#czgh <- list.files(czdir, pattern = city.str, full.names = T) %>% readRDS()
plcgh <- list.files(plcdir, pattern = city.str, full.names = T) %>% readRDS()


# little additions --------------------------------------------------------

# i had formed without trimming by population, just to allow max flexibility
# later
trim.pop <- function(gh, pop.floor = 500){
  gh %>%
    activate("nodes") %>%
    filter(pop > pop.floor)}

plcgh <- plcgh %>% trim.pop

# reattach geois
plcgh <- plcgh %>% spatialize.nodes()

# get visuals -------------------------------------------------------------

plcplot <-
  Wrapper_make.graph.map(plcgh,
                         tie.str.floor = 5e-3,
                         flow.colm = "binned.tie.strength"
                         ,trim2plc = T
                         ,n_breaks = 3
                         ,digits = 5)

plcplot

# everything for cz or plc  -------------------------------------------------

#gh <- czgh
gh <- plcgh

# some setup --------------------------------------------------------------

# spatial nodes
sfn <- gh %>% get_nodes() %>% st_sf()

# spatial edges
sfe <-  gh %>%
  activate("edges") %>%
  sfnetworks::as_sfnetwork(., directed = T,
                           edges_as_lines = T) %>%
  get_edges() %>%
  st_sf()

# edge distances
sfe$dst <- st_length(sfe$geometry)
sfe$dst <- as.numeric(sfe$dst) / 1000 # to km

# pairwise (tractxtract) distance matrix
dst.mat <- build.pairwise.dst.matrix(sfn)
# max km span
max(dst.mat)

# do spatial interaction function analysis ------------------------------------

# binned relative frequencies (# of ties over distance, relative to crosswise
# distance distribution)
dst.freq <- relative.tie_dst.frequency(
  sfn, sfe, dst.mat = dst.mat,
  n.bins = 60)

dst.freq %>%
  ggplot(aes(x = dst.lvl,
             y = avg.flow)) +
  geom_bar(stat = "identity")

# log-log'd
dst.freq %>%
  ggplot(aes(x = log(dst.lvl),
             y = log(avg.flow))) +
  geom_path()
# this pattern is common for this and shows up in the Daraganova paper and all
# the places I've looked at at any level. It shows the the "power law"
# distribution holds where distance < threshold, and then breaks down.
exp(3)

#' trim to before when power law distribution begins breaking down
ghT <- gh %>%
  activate("edges") %>%
  mutate(dst = sfe$dst) %>%
  filter(dst < 20) #50)

dst.freq <- relative.tie_dst.frequency(
  gh=ghT,
  n.bins = 60)

dst.freq %>%
  ggplot(aes(x = dst.lvl,
             y = avg.flow)) +
  geom_bar(stat = "identity")

# log-log'd, with distant trips trimmed
dst.freq %>%
  ggplot(aes(x = log(dst.lvl),
             y = log(avg.flow))) +
  geom_path()


# estimate parameters for SIF ---------------------------------------------

#' these steps were kinda confusing to me. the `igraph` package has
#' power.law.fit, and another package called `poweRlaw` dedicated to fitting
#' power law distributions, which cites some literature about doing this. Then
#' the Daraganova and Butts and Acton papers do a slightly different thing, with
#' no clear implementation in R. I played around and was kinda confused about
#' this, but I end up (I think) implementing the Daraganova and Butts and Acton
#' strategy, which is to get a general distribution (power law), and then use
#' maximum-likelihood estimation (MLE) to fit the parameters. I use the tutorial
#' here to do the MLE for the power law distribution

# below relies on the `dst.freq` in the general environment
param.fitting <-
  stats::optim(par= c(1,1.7,1, 10),
               fn = power.law.loss.fcn, hessian = T)


dst.freq$flow.hat <-
  power.law(dst.freq$dst.lvl,
            param.fitting$par[1],
            param.fitting$par[2],
            param.fitting$par[3]
  )

dst.freq %>%
  select(dst.lvl, avg.flow, flow.hat) %>%
  pivot_longer(cols = contains("flow"),
               values_to = "avg.flow") %>%
  ggplot() +
  geom_path(aes(color = name,
                y = avg.flow,
                x = dst.lvl))

#so there's the general distribution, and we can use those estimates to
#parameterize the SIF in the ERGM
decay.mat <-
  dst.mat %>%
  power.law(
    1, #param.fitting$par[1],
    param.fitting$par[2],
    param.fitting$par[3]
  )



# run an ergm -------------------------------------------------------------
library(statnet)

# to trim ties to save time while hashing out code
sfe$tie.strength %>% quantile(seq(0,1,.05))
ghT <-
  ghT %>%
  activate("edges") %>%
  filter(tie.strength > 1e-3)

# erg model specification
fo <-
  formula(
    ergh ~ edges +
      nodematch("int.poly", diff = FALSE) +
      # nodematch("in.cc") +
      edgecov(decay.mat, "proximity") +
      nodecov("ln.pop.dens", form="sum") +
      absdiff("perc.wh", pow=1) +
      absdiff("perc.hsp", pow=1) +
      absdiff("hh.median.income", pow=1)
  )


# switch to statnet libraries
ergh <<- ergm.clean.n.convert(ghT)

# set seed
set.seed(1234)

# run model
mo <- ergm(
  fo,
  response = "tie.strength",
  reference = ~Poisson
)

mo
summary(mo)

# same ws
save.image(
  paste0(cur.dir,
         "19700",
         "-philadelphia-cz-analysis-env.RData"))


# plotting, facetted by distance ------------------------------------------

# ( i think for most areas, this is only interesting at Place/county level-- CZ
# will be often too zoomed out)

#' Finally, let's facet the plots above by distance bins:
#'
#' 1) Very close; less than a mile (~1.6km)
#'
#' 2) kinda close; less than 5 mi (~8km)
#'
#' 3) below 50 km, around when power law distribution begins breaking down
#'
#' 4) >50km
spatial.brks <-
  c(0, 1.6, 8, 50,1e9)

agh <- ghT %>%
  activate("edges") %>%
  mutate(spatial.scale = cut(dst,
                             spatial.brks,
                             include.lowest = T))

# make graph outside of Wrapper to play with more

# gotta do it outside of wrapper to facet
p <-
  Wrapper_make.graph.map(agh,
                         tie.str.floor = 5e-3,
                         flow.colm = "binned.tie.strength"
                         ,trim2plc = F
                         ,n_breaks = 3
                         ,digits = 5)
p +
  facet_wrap(~spatial.scale)
kmcd39/divflow documentation built on Dec. 21, 2021, 7:38 a.m.